library(ggplot2)
library(stringi)
library(gridExtra)
library(dendextend)
library(kableExtra)
library(limma)
library(psych)
library(tidyverse)
library(CONSTANd) # install from source: https://github.com/PDiracDelta/CONSTANd/
This notebook presents an isobaric labeling data analysis strategy that includes data-driven normalization.
We will check how varying analysis components [summarization/normalization/differential abundance testing methods] change the end results of a quantitative proteomic study.
# Load project helper functions for data wrangling and plotting.
source('./other_functions.R')
source('./plotting_functions.R')

# Read the prepared input data (if the file lives elsewhere, make a symbolic
# link to it in this directory).
data.list <- readRDS('input_data.rds')
dat.l <- data.list$dat.l  # data in long format
# dat.w <- data.list$dat.w # data in wide format

# Drop a spurious row-name column 'X' if one was carried over into the RDS.
if ('X' %in% colnames(dat.l)) {
  dat.l$X <- NULL
}
# Remove shared peptides.
# BUGFIX: the original stored the filtered data in an unused variable
# (`shared.peptides`), so dat.l still contained shared peptides downstream;
# assign the filtered result back to dat.l as the comment intended.
dat.l <- dat.l %>% filter(!shared.peptide)

# Keep spectra with isolation interference < 30% and no missing
# quantification channels (both flags precomputed in the input data).
dat.l <- dat.l %>% filter(isoInterOk & noNAs)

# Which proteins were spiked in? They carry the 'ups' tag in their accession.
spiked.proteins <- dat.l %>%
  distinct(Protein) %>%
  filter(stri_detect(Protein, fixed = 'ups')) %>%
  pull() %>%
  as.character()
# which peptides were identified in each MS run?
unique.pep=dat.l %>%
group_by(Run) %>%
distinct(Peptide) %>%
mutate(val=1)
unique.pep <- xtabs(val~Peptide+Run, data=unique.pep)
tmp <- apply(unique.pep, 1, function(x) all(x==1))
inner.peptides <- rownames(unique.pep)[tmp]
#TEMP -remove!
# tmp=dat.l %>% distinct(Protein) %>% pull %>% as.character
# dat.l <- dat.l %>% filter(Protein %in% c(tmp[sample(1:length(tmp), size=20)], spiked.proteins))
# Names and number of the varying-component (quantification unit) variants.
variant.names <- c('log2Intensity', 'Intensity', 'Ratio')
n.comp.variants <- length(variant.names)
# Scale of each variant; ratios count as 'raw' because they are essentially
# mean-normalized intensities.
scale.vec <- c('log', 'raw', 'raw')
# Reference condition used for plots and DEA, and the reference channel label.
referenceCondition <- '0.5'
referenceChannels <- 'Norm'
# Fixed plotting colour for each biological condition.
condition.colour <- tribble(
  ~Condition, ~Colour,
  "0.125", 'black',
  "0.5",   'blue',
  "0.667", 'green',
  "1",     'red',
  "Norm",  'orange')
# Sample metadata: one row per distinct (Run, Channel, Sample, Condition, Colour).
sample.info <- get_sample_info(dat.l, condition.colour)
channelNames <- remove_factors(unique(sample.info$Channel))
Which scale are the reporter ion intensities on?
# One long-format data set per quantification-unit variant.
dat.unit.l <- emptyList(variant.names)
# Variant 1: log2-transformed reporter intensities.
dat.unit.l[[1]] <- dat.l %>%
  mutate(response = log2(Intensity)) %>%
  select(-Intensity)
# Variant 2: raw reporter intensities.
dat.unit.l[[2]] <- dat.l %>% rename(response = Intensity)
Calculate ratios (per PSM) with respect to the average intensity within the run — in other words, each value is divided by the row mean of the reference channels.
# Variant 3 (ratios): use half-wide data to compute, per PSM, the within-run
# average intensity over the channels of the reference condition; each channel
# value is then divided by that denominator.
refCols <- sample.info %>%
  filter(Condition == referenceCondition) %>%
  distinct(Channel) %>%
  pull()
denom.df <- dat.l %>%
  filter(Condition == referenceCondition) %>%
  pivot_wider(id_cols = -one_of('Condition', 'BioReplicate'),
              names_from = 'Channel', values_from = 'Intensity')
# rowMeans is the vectorized equivalent of apply(x, 1, mean).
denom.df$denom <- rowMeans(denom.df[, refCols], na.rm = TRUE)
# Key columns that identify a PSM; used both to subset and to join.
psm.keys <- c('Run', 'Protein', 'Peptide', 'RT', 'Charge', 'PTM')
denom.df <- denom.df[, c(psm.keys, 'denom')]
dat.unit.l$Ratio <- dat.l %>%
  left_join(denom.df, by = psm.keys) %>%
  mutate(response = Intensity / denom) %>%
  select(-c(Intensity, denom))
# Grand median response per variant (added back later in the CV plots).
grand_median <- unlist(lapply(dat.unit.l, function(x) median(x$response, na.rm = TRUE)))

# Switch to (half-)wide format: one column per quantification channel.
dat.unit.w <- lapply(dat.unit.l, function(x) {
  pivot_wider(data = x, id_cols = -one_of(c('Condition', 'BioReplicate')),
              names_from = Channel, values_from = response)
})

# Spectrum-level normalization: subtract each spectrum's (row's) median value
# from its channel values. (The original first copied dat.unit.w into
# dat.norm.w and then overwrote it; map directly instead.)
dat.norm.w <- lapply(dat.unit.w, function(x) {
  row.med <- apply(x[, channelNames], 1, median, na.rm = TRUE)
  x[, channelNames] <- sweep(x[, channelNames], 1, row.med)
  x
})
Summarize quantification values from PSM to peptide (first step) to protein (second step).
# Normalized data: two-step median summarization within each run —
# PSM -> peptide, then peptide -> protein.
dat.norm.summ.w <- lapply(dat.norm.w, function(x) {
  x %>%
    group_by(Run, Protein, Peptide) %>%
    # PSM -> peptide: median over PSMs of each (Run, Protein, Peptide);
    # summarise() drops the last grouping level (Peptide) afterwards.
    summarise(across(all_of(channelNames), function(v) median(v, na.rm = TRUE))) %>%
    # peptide -> protein: median over peptides of each (Run, Protein).
    summarise(across(all_of(channelNames), function(v) median(v, na.rm = TRUE))) %>%
    ungroup()
})
Notice that the row sums are not equal to Ncols anymore, because the median summarization does not preserve them (but mean summarization does).
Let’s also summarize the non-normalized data for comparison in the next section.
# Non-normalized data: identical two-step median summarization
# (PSM -> peptide -> protein, within each run) for comparison.
dat.nonnorm.summ.w <- lapply(dat.unit.w, function(x) {
  x %>%
    group_by(Run, Protein, Peptide) %>%
    # PSM -> peptide: median over PSMs; summarise() drops the Peptide level.
    summarise(across(all_of(channelNames), function(v) median(v, na.rm = TRUE))) %>%
    # peptide -> protein: median over peptides of each (Run, Protein).
    summarise(across(all_of(channelNames), function(v) median(v, na.rm = TRUE))) %>%
    ungroup()
})
# medianSweeping, column step: within each MS run separately, subtract from
# every channel its median computed across all proteins in that run.
# (The original's inner `dat.norm.summ.w <- bind_rows(...)` only worked
# because it was the lambda's last expression; return the value directly.)
dat.norm.summ.w <- lapply(dat.norm.summ.w, function(x) {
  per.run <- lapply(split(x, x$Run), function(y) {
    col.med <- apply(y[, channelNames], 2, median, na.rm = TRUE)
    y[, channelNames] <- sweep(y[, channelNames], 2, col.med)
    y
  })
  bind_rows(per.run)
})
# Make the data completely wide (also across runs): one column per
# "Run:Channel" combination.
widen_across_runs <- function(x) {
  x %>% pivot_wider(names_from = Run, values_from = all_of(channelNames),
                    names_glue = "{Run}:{.value}")
}
## non-normalized data
dat.nonnorm.summ.w2 <- lapply(dat.nonnorm.summ.w, widen_across_runs)
## normalized data
dat.norm.summ.w2 <- lapply(dat.norm.summ.w, widen_across_runs)
# Side-by-side boxplots of raw vs. normalized data per variant
# (uses the half-wide format). seq_len is safe if n.comp.variants were 0.
for (i in seq_len(n.comp.variants)) {
  par(mfrow = c(1, 2))
  boxplot_w(dat.nonnorm.summ.w[[i]], sample.info, paste('Raw', variant.names[i], sep = '_'))
  boxplot_w(dat.norm.summ.w[[i]], sample.info, paste('Normalized', variant.names[i], sep = '_'))
  par(mfrow = c(1, 1))
}
MA plots of two single samples taken from condition 1 and condition 0.125, measured in different MS runs (samples Mixture2_1:127C and Mixture1_2:129N, respectively).
# MA plots of two single samples. Different unit variants require different
# computation of fold changes and average abundance (additive vs.
# multiplicative scale); see the maplot_ils function. Uses the wide2 format.
for (i in seq_len(n.comp.variants)) {
  p1 <- maplot_ils(dat.nonnorm.summ.w2[[i]], 'Mixture2_1:127C', 'Mixture1_2:129N',
                   scale.vec[i], paste('Raw', variant.names[i], sep = '_'), spiked.proteins)
  p2 <- maplot_ils(dat.norm.summ.w2[[i]], 'Mixture2_1:127C', 'Mixture1_2:129N',
                   scale.vec[i], paste('Normalized', variant.names[i], sep = '_'), spiked.proteins)
  grid.arrange(p1, p2, ncol = 2)
}
MA plots contrasting condition 1 and condition 0.125 (quantification values averaged within condition).
# different unit variants require different computation of fold changes and average abundance: additive or multiplicative scale; see maplot_ils function
# MA plots with values averaged within condition: numerator samples from
# condition '1', denominator samples from condition '0.125'.
channels.num <- sample.info %>% filter(Condition == '1') %>% select(Sample) %>% pull()
channels.denom <- sample.info %>% filter(Condition == '0.125') %>% select(Sample) %>% pull()
for (i in seq_len(n.comp.variants)) {
  p1 <- maplot_ils(dat.nonnorm.summ.w2[[i]], channels.num, channels.denom,
                   scale = scale.vec[i], paste('Raw', variant.names[i], sep = '_'), spiked.proteins)
  p2 <- maplot_ils(dat.norm.summ.w2[[i]], channels.num, channels.denom,
                   scale = scale.vec[i], paste('Normalized', variant.names[i], sep = '_'), spiked.proteins)
  grid.arrange(p1, p2, ncol = 2)
}
# Back to long format; recover the Mixture name as the part of Run before '_'.
add_mixture_then_lengthen <- function(x) {
  x$Mixture <- vapply(stri_split(x$Run, fixed = '_'),
                      function(parts) parts[1], character(1))
  to_long_format(x, sample.info)
}
dat.nonnorm.summ.l <- lapply(dat.nonnorm.summ.w, add_mixture_then_lengthen)
dat.norm.summ.l <- lapply(dat.norm.summ.w, add_mixture_then_lengthen)
# CV plots per condition, raw vs. normalized. The grand median is added back
# to the normalized data so CVs are computed on a comparable scale.
par(mfrow = c(1, 2))
for (i in seq_len(n.comp.variants)) {
  cvplot_ils(dat = dat.nonnorm.summ.l[[i]], feature.group = 'Protein',
             xaxis.group = 'Condition',
             title = paste('Raw', variant.names[i], sep = '_'))
  cvplot_ils(dat = dat.norm.summ.l[[i]], feature.group = 'Protein',
             xaxis.group = 'Condition',
             title = paste('Normalized', variant.names[i], sep = '_'),
             add.constant = grand_median[i])
}
par(mfrow = c(1, 1))
# PCA of all proteins, raw vs. normalized, per variant (wide2 format).
par(mfrow = c(1, 2))
for (i in seq_along(dat.norm.summ.w2)) {
  pcaplot_ils(dat.nonnorm.summ.w2[[i]] %>% select(-'Protein'),
              info = sample.info, paste('Raw', variant.names[i], sep = '_'))
  pcaplot_ils(dat.norm.summ.w2[[i]] %>% select(-'Protein'),
              info = sample.info, paste('Normalized', variant.names[i], sep = '_'))
}
par(mfrow = c(1, 1))
# PCA restricted to the spiked-in proteins only.
for (i in seq_along(dat.norm.summ.w2)) {
  par(mfrow = c(1, 2))
  spiked.raw <- dat.nonnorm.summ.w2[[i]] %>%
    filter(Protein %in% spiked.proteins) %>%
    select(-'Protein')
  spiked.norm <- dat.norm.summ.w2[[i]] %>%
    filter(Protein %in% spiked.proteins) %>%
    select(-'Protein')
  pcaplot_ils(spiked.raw, info = sample.info, paste('Raw', variant.names[i], sep = '_'))
  pcaplot_ils(spiked.norm, info = sample.info, paste('Normalized', variant.names[i], sep = '_'))
  par(mfrow = c(1, 1))
}
# Hierarchical-clustering dendrograms of the samples, raw vs. normalized.
for (i in seq_along(dat.norm.summ.w2)) {
  par(mfrow = c(1, 2))
  dendrogram_ils(dat.nonnorm.summ.w2[[i]] %>% select(-Protein),
                 info = sample.info, paste('Raw', variant.names[i], sep = '_'))
  dendrogram_ils(dat.norm.summ.w2[[i]] %>% select(-Protein),
                 info = sample.info, paste('Normalized', variant.names[i], sep = '_'))
  par(mfrow = c(1, 1))
}
# Drop the reference channels (Condition == referenceChannels) before DEA.
# NOTE(review): distinct() evaluates its arguments with mutate-like semantics,
# so `distinct(Run:Channel)` applies `:` to the Run and Channel columns;
# presumably the intent is to obtain the "Run:Channel" sample identifiers that
# name the wide2 columns — verify this matches sample.info$Sample / the wide2
# column names produced by names_glue = "{Run}:{.value}".
chr <- sample.info %>% filter(Condition==referenceChannels) %>% distinct(Run:Channel) %>% pull %>% as.character
# Remove those columns from every variant's wide2 table.
dat.norm.summ.w2 <- lapply(dat.norm.summ.w2, function(x) x %>% select(-one_of(chr)))
# Keep sample.info in sync and drop the now-unused factor levels.
sample.info <- sample.info %>% filter(!(Condition %in% referenceChannels)) %>% droplevels
NOTE: - actually, lmFit (used in moderated_ttest) was built for log2-transformed data. However, supplying untransformed intensities can also work. This just means that the effects in the linear model are also additive on the untransformed scale, whereas for log-transformed data they are multiplicative on the untransformed scale. Also, there may be a bias which occurs from biased estimates of the population means in the t-tests, as mean(X) is not equal to exp(mean(log(X))).
#{ INVESTIGATE late log2 transform: add a fourth variant in which raw
#  intensities are log2-transformed only AFTER summarization/normalization.
dat.norm.summ.w2$Intensity_lateLog2 <- dat.norm.summ.w2$Intensity
# All quantification columns (everything except the Protein identifier).
channelNames.prefixed <- colnames(dat.norm.summ.w2$Intensity %>% select(-Protein))
dat.norm.summ.w2$Intensity_lateLog2[, channelNames.prefixed] <-
  log2(dat.norm.summ.w2$Intensity[, channelNames.prefixed])
# Register the extra variant alongside the existing ones.
variant.names <- names(dat.norm.summ.w2)
scale.vec <- c(scale.vec, 'log')
n.comp.variants <- n.comp.variants + 1
#}
# Differential expression analysis: moderated t-test (limma) per variant,
# on the scale appropriate to that variant.
design.matrix <- get_design_matrix(referenceCondition, sample.info)
dat.dea <- emptyList(names(dat.norm.summ.w2))
for (i in seq_along(dat.norm.summ.w2)) {
  # Look up this variant's scale by name rather than by position.
  this_scale <- scale.vec[match(names(dat.dea)[i], variant.names)]
  d <- column_to_rownames(as.data.frame(dat.norm.summ.w2[[i]]), 'Protein')
  dat.dea[[i]] <- moderated_ttest(dat = d, design.matrix, scale = this_scale)
}
# Confusion matrices at q.mod < 0.05, using the spike-ins as ground truth.
cm <- conf_mat(dat.dea, 'q.mod', 0.05, spiked.proteins)
print_conf_mat(cm, referenceCondition)
| background | spiked | background | spiked | background | spiked | background | spiked | |
|---|---|---|---|---|---|---|---|---|
| not_DE | 4056 | 4 | 4057 | 13 | 4055 | 7 | 3862 | 4 |
| DE | 3 | 15 | 2 | 6 | 4 | 12 | 2 | 0 |
| log2Intensity | Intensity | Ratio | Intensity_lateLog2 | |
|---|---|---|---|---|
| Accuracy | 0.998 | 0.996 | 0.997 | 0.998 |
| Sensitivity | 0.789 | 0.316 | 0.632 | 0.000 |
| Specificity | 0.999 | 1.000 | 0.999 | 0.999 |
| PPV | 0.833 | 0.750 | 0.750 | 0.000 |
| NPV | 0.999 | 0.997 | 0.998 | 0.999 |
| background | spiked | background | spiked | background | spiked | background | spiked | |
|---|---|---|---|---|---|---|---|---|
| not_DE | 4059 | 17 | 4059 | 18 | 4058 | 16 | 3773 | 19 |
| DE | 0 | 2 | 0 | 1 | 1 | 3 | 3 | 0 |
| log2Intensity | Intensity | Ratio | Intensity_lateLog2 | |
|---|---|---|---|---|
| Accuracy | 0.996 | 0.996 | 0.996 | 0.994 |
| Sensitivity | 0.105 | 0.053 | 0.158 | 0.000 |
| Specificity | 1.000 | 1.000 | 1.000 | 0.999 |
| PPV | 1.000 | 1.000 | 0.750 | 0.000 |
| NPV | 0.996 | 0.996 | 0.996 | 0.995 |
| background | spiked | background | spiked | background | spiked | background | spiked | |
|---|---|---|---|---|---|---|---|---|
| not_DE | 4054 | 5 | 4058 | 10 | 4054 | 4 | 3561 | 11 |
| DE | 5 | 14 | 1 | 9 | 5 | 15 | 0 | 1 |
| log2Intensity | Intensity | Ratio | Intensity_lateLog2 | |
|---|---|---|---|---|
| Accuracy | 0.998 | 0.997 | 0.998 | 0.997 |
| Sensitivity | 0.737 | 0.474 | 0.789 | 0.083 |
| Specificity | 0.999 | 1.000 | 0.999 | 1.000 |
| PPV | 0.737 | 0.900 | 0.750 | 1.000 |
| NPV | 0.999 | 0.998 | 0.999 | 0.997 |
# Character vectors naming the logFC and significance (q-value) columns.
dea.cols <- colnames(dat.dea[[1]])
logFC.cols <- dea.cols[stri_detect_fixed(dea.cols, 'logFC')]
significance.cols <- dea.cols[stri_detect_fixed(dea.cols, 'q.mod')]
n.contrasts <- length(logFC.cols)
# Compare variants: scatter plots of q-values and log2FC across variants,
# then a volcano plot per contrast. seq_len is safe if n.contrasts were 0.
scatterplot_ils(dat.dea, significance.cols, 'q-values', spiked.proteins)
scatterplot_ils(dat.dea, logFC.cols, 'log2FC', spiked.proteins)
for (i in seq_len(n.contrasts)) {
  volcanoplot_ils(dat.dea, i, spiked.proteins)
}
Let’s see whether the spiked protein fold changes make sense
# Violin plot of spiked-protein log fold changes per variant, with the
# theoretical values drawn as horizontal lines.
spiked.logfc <- lapply(dat.dea, function(x) x[spiked.proteins, logFC.cols])
violinplot_ils(spiked.logfc, referenceCondition)
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 18363)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United Kingdom.1252 LC_CTYPE=English_United Kingdom.1252
## [3] LC_MONETARY=English_United Kingdom.1252 LC_NUMERIC=C
## [5] LC_TIME=English_United Kingdom.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] CONSTANd_0.99.0 forcats_0.5.0 stringr_1.4.0 dplyr_1.0.2 purrr_0.3.4
## [6] readr_1.4.0 tidyr_1.1.2 tibble_3.0.4 tidyverse_1.3.0 psych_2.0.9
## [11] limma_3.44.3 kableExtra_1.3.1 dendextend_1.14.0 gridExtra_2.3 stringi_1.5.3
## [16] ggplot2_3.3.2 rmarkdown_2.5
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-149 fs_1.5.0 lubridate_1.7.9 webshot_0.5.2 httr_1.4.2
## [6] JGmisc_0.3 tools_4.0.3 backports_1.1.10 R6_2.5.0 rpart_4.1-15
## [11] DBI_1.1.0 mgcv_1.8-33 colorspace_1.4-1 nnet_7.3-14 withr_2.3.0
## [16] tidyselect_1.1.0 mnormt_2.0.2 compiler_4.0.3 cli_2.1.0 rvest_0.3.6
## [21] xml2_1.3.2 labeling_0.4.2 scales_1.1.1 digest_0.6.26 pkgconfig_2.0.3
## [26] htmltools_0.5.0 highr_0.8 dbplyr_2.0.0 rlang_0.4.8 readxl_1.3.1
## [31] rstudioapi_0.12 farver_2.0.3 generics_0.1.0 jsonlite_1.7.1 ModelMetrics_1.2.2.2
## [36] magrittr_1.5 Matrix_1.2-18 Rcpp_1.0.5 munsell_0.5.0 fansi_0.4.1
## [41] viridis_0.5.1 lifecycle_0.2.0 pROC_1.16.2 yaml_2.2.1 MASS_7.3-53
## [46] plyr_1.8.6 recipes_0.1.15 grid_4.0.3 parallel_4.0.3 crayon_1.3.4
## [51] lattice_0.20-41 haven_2.3.1 splines_4.0.3 hms_0.5.3 tmvnsim_1.0-2
## [56] knitr_1.30 pillar_1.4.6 reshape2_1.4.4 codetools_0.2-16 stats4_4.0.3
## [61] reprex_0.3.0 glue_1.4.2 evaluate_0.14 data.table_1.13.2 modelr_0.1.8
## [66] vctrs_0.3.4 foreach_1.5.1 cellranger_1.1.0 gtable_0.3.0 assertthat_0.2.1
## [71] xfun_0.18 gower_0.2.2 prodlim_2019.11.13 broom_0.7.2 e1071_1.7-4
## [76] class_7.3-17 survival_3.2-7 viridisLite_0.3.0 timeDate_3043.102 iterators_1.0.13
## [81] lava_1.6.8.1 ellipsis_0.3.1 caret_6.0-86 ipred_0.9-9